In [2]:
# Inject the project's custom stylesheet into the notebook.
from IPython.core.display import HTML

# Read via a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)
Out[2]:
In [1]:
import pandas as pd
import numpy as np
In [5]:
# Location of the tab-separated tweet export that the next cell reads.
# NOTE(review): this is an absolute path on one user's machine, so the notebook
# will not run elsewhere. Other cells use repo-relative paths ("../css",
# "../charts"); presumably "../data/tweets.tsv" is equivalent — confirm the
# notebook's working directory before switching.
tsvpath = '/Users/rcn/Desktop/twitter-analysis/data/tweets.tsv'
In [7]:
# Read in the TSV with pandas' default NaN catching turned off, so that
# unrecognised gender values are left in as text. Only the literal strings
# 'NaN' and '' are treated as missing. Without keep_default_na=False the
# na_values list is merely *added* to pandas' built-in NA markers and the
# defaults would still be applied.
# NOTE(review): with the defaults off, any other NA marker inside a numeric
# column would turn that column into dtype object — check twitterData.dtypes.
twitterData = pd.read_table(
    tsvpath,
    encoding='utf-8',
    na_values=['NaN', ''],
    keep_default_na=False,
    parse_dates=[1]  # parse the column at position 1 as datetimes
)
In [8]:
# Preview the first rows of the loaded frame to sanity-check the parse.
twitterData.head()
Out[8]:
In [9]:
# Show the dtype pandas assigned to each column.
twitterData.dtypes
Out[9]:
In [10]:
# Row count of the loaded frame, reported as the number of tweets.
nTweets = len(twitterData.index)
# A single parenthesised string prints identically under Python 2 and
# Python 3; the original print statement is a SyntaxError on Python 3.
print("There are {0} tweets in the full dataset".format(nTweets))
In [15]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
%matplotlib inline
In [16]:
# Quick-look plot of the whole frame using the pandas plotting defaults.
twitterData.plot()
Out[16]:
In [17]:
# Absolute number of tweets per Language value. value_counts() already sorts
# in descending order of frequency by default, so no extra arguments are needed.
nLanguage = twitterData['Language'].value_counts()
# Ten most frequent languages.
nLanguage.head(10)
Out[17]:
In [18]:
# Same ranking as relative frequencies (fractions of the counted rows)
# rather than absolute counts; descending order is the default.
nLanguage = twitterData['Language'].value_counts(normalize=True)
# Ten most frequent languages.
nLanguage.head(10)
Out[18]:
In [20]:
# Absolute number of tweets per 'User Location' value, most common first
# (the value_counts defaults).
nLocation = twitterData['User Location'].value_counts()
# Fifteen most frequent locations.
nLocation.head(15)
Out[20]:
In [37]:
# Getting Vincent ready
# `vincent` is referenced here and in the chart cells but is not imported by
# any visible cell, so a fresh kernel would raise NameError without this.
import vincent

vincent.initialize_notebook()

# Hex colour palette used by the charts in the following cells.
gpBlue = '#00aeef'
gpLightGray = '#96999b'
gpDarkBlue = '#00447c'
gpRed = '#cf5c42'
gpBrown = '#e1d8ad'
gpPink = '#f4d5e3'
gpLightBlue = '#e1f4fd'
In [39]:
# Mean follower count per UNGPLocation, drawn as a Vincent bar chart and
# exported to ../charts/followersBar.json.
from vincent.axes import AxisProperties
from vincent.properties import PropertySet
from vincent.values import ValueRef

location_grouped = twitterData.groupby('UNGPLocation')
# dropna() discards locations whose aggregated row contains a missing mean.
mean_location_grouped = location_grouped.mean().dropna()
# NOTE(review): DataFrame.sort only exists in old pandas releases; newer
# versions spell this sort_values('Followers'). Kept as-is to match the
# pandas generation this notebook was written against.
mean_followers = mean_location_grouped.sort('Followers')['Followers']

followersBar = vincent.Bar(mean_followers)
followersBar.axis_titles(x='Country', y='Followers')

# Restyle every axis in light grey. The loop nesting was lost in the export
# and is restored here: the inner loop sets the stroke of the line-like axis
# parts; title and labels are set once per axis.
# NOTE(review): the same styling is repeated in the friends and gender
# probability cells; a shared helper would remove the duplication.
for axis in followersBar.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop,
                PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis (presumably x — confirm): unrotated, centred labels and a
# title offset of 20.
followersBar.axes[0].properties.labels.angle = ValueRef(value=0)
followersBar.axes[0].properties.labels.align = ValueRef(value='center')
followersBar.axes[0].properties.title.dy = ValueRef(value=20)
# scales[2] is presumably the colour scale — confirm; a one-colour range
# paints every bar the same colour.
followersBar.scales[2].range = [gpBlue]
followersBar.to_json('../charts/followersBar.json')
followersBar
Out[39]:
In [40]:
# Mean friend count per UNGPLocation, drawn as a Vincent bar chart and
# exported to ../charts/friendsBar.json.
# AxisProperties, PropertySet and ValueRef are imported in the followers
# chart cell, which must have been run first.
location_grouped = twitterData.groupby('UNGPLocation')
# dropna() discards locations whose aggregated row contains a missing mean.
mean_location_grouped = location_grouped.mean().dropna()
# NOTE(review): DataFrame.sort only exists in old pandas releases; newer
# versions spell this sort_values('Friends').
mean_friends = mean_location_grouped.sort('Friends')['Friends']

friendsBar = vincent.Bar(mean_friends)
friendsBar.axis_titles(x='Country', y='Friends')

# Restyle every axis in light grey. The loop nesting was lost in the export
# and is restored here.
for axis in friendsBar.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop,
                PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis (presumably x — confirm): unrotated, centred labels and a
# title offset of 20.
friendsBar.axes[0].properties.labels.angle = ValueRef(value=0)
friendsBar.axes[0].properties.labels.align = ValueRef(value='center')
friendsBar.axes[0].properties.title.dy = ValueRef(value=20)
# scales[2] is presumably the colour scale — confirm.
friendsBar.scales[2].range = [gpDarkBlue]
friendsBar.to_json('../charts/friendsBar.json')
friendsBar
Out[40]:
In [41]:
# Mean UNGPGenderProb per UNGPLocation, drawn as a Vincent bar chart and
# exported to ../charts/genderProbBar.json.
# AxisProperties, PropertySet and ValueRef are imported in the followers
# chart cell, which must have been run first.
location_grouped = twitterData.groupby('UNGPLocation')
# dropna() discards locations whose aggregated row contains a missing mean.
mean_location_grouped = location_grouped.mean().dropna()
# NOTE(review): DataFrame.sort only exists in old pandas releases; newer
# versions spell this sort_values('UNGPGenderProb').
mean_genderProb = mean_location_grouped.sort('UNGPGenderProb')['UNGPGenderProb']

genderProb = vincent.Bar(mean_genderProb)
# Axis title typo fixed ("Probablility" -> "Probability"); the text is
# rendered on the chart and written to the exported JSON.
genderProb.axis_titles(x='Country', y='Average Gender Probability')

# Restyle every axis in light grey. The loop nesting was lost in the export
# and is restored here.
for axis in genderProb.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop,
                PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20),
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))

# First axis (presumably x — confirm): unrotated, centred labels and a
# title offset of 20.
genderProb.axes[0].properties.labels.angle = ValueRef(value=0)
genderProb.axes[0].properties.labels.align = ValueRef(value='center')
genderProb.axes[0].properties.title.dy = ValueRef(value=20)
# scales[2] is presumably the colour scale — confirm.
genderProb.scales[2].range = [gpRed]
genderProb.to_json('../charts/genderProbBar.json')
genderProb
Out[41]:
In [42]:
# `mpld3` is used below but not imported by any visible cell; without this
# the cell raises NameError on a fresh kernel.
import mpld3

mpld3.enable_notebook()
# Number of tweets per UNGPLocation, most frequent first (value_counts defaults).
gatesCountry = twitterData.UNGPLocation.value_counts()
# Horizontal bar chart; note that Series.plot returns a matplotlib Axes,
# not a Figure, despite the variable name.
gatesCountryFig = gatesCountry.plot(kind='barh', color='#00aeef')
# Called without a figure argument, so it renders the current figure.
mpld3.display()
In [70]:
# Bar chart of tweet counts per UNGPLocation using the ggplot package.
import ggplot as gg

# `import ggplot as gg` binds only the name `gg`, so the plot constructor
# must be reached as gg.ggplot; the bare name `ggplot` is undefined here.
(gg.ggplot(gg.aes(x='UNGPLocation'), data=twitterData)
 + gg.geom_bar()
 + gg.ggtitle("Gates Tweets")
 + gg.labs("Country", "Number of tweets"))
In [77]:
# Bar chart of tweet counts per DataSiftLanguage.
# All ggplot names are qualified with the `gg` alias bound by the preceding
# cell's `import ggplot as gg`; the unqualified names (ggplot, aes, geom_bar,
# ggtitle, labs) are not imported by any visible cell.
languagePlot = (
    gg.ggplot(gg.aes(x='DataSiftLanguage'), data=twitterData)
    + gg.geom_bar()
    + gg.ggtitle("Language Distribution")
    + gg.labs("Language", "Number of tweets")
)
languagePlot
In [1]:
# Inject the project's custom stylesheet into the notebook.
from IPython.core.display import HTML

# Read via a context manager so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)
Out[1]:
In [ ]: